import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import warnings
!pip install --upgrade seaborn matplotlib
Requirement already satisfied: seaborn in c:\users\kanon\anaconda3\lib\site-packages (0.12.2)
Collecting seaborn
Downloading seaborn-0.13.0-py3-none-any.whl (294 kB)
------------------------------------ 294.6/294.6 kB 520.8 kB/s eta 0:00:00
Requirement already satisfied: matplotlib in c:\users\kanon\anaconda3\lib\site-packages (3.7.0)
Collecting matplotlib
Downloading matplotlib-3.8.0-cp310-cp310-win_amd64.whl (7.6 MB)
---------------------------- 5.4/7.6 MB 664.8 kB/s eta 0:00:04
ERROR: Wheel 'matplotlib' located at C:\Users\Kanon\AppData\Local\Temp\pip-unpack-d6bdh951\matplotlib-3.8.0-cp310-cp310-win_amd64.whl is invalid.
import pandas as pd
# Load the mall customer records and discard any row with a missing value,
# then display the resulting frame as the cell output.
df = pd.read_csv('Mall_Customers.csv').dropna()
df
| CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) | |
|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 |
| 1 | 2 | Male | 21 | 15 | 81 |
| 2 | 3 | Female | 20 | 16 | 6 |
| 3 | 4 | Female | 23 | 16 | 77 |
| 4 | 5 | Female | 31 | 17 | 40 |
| ... | ... | ... | ... | ... | ... |
| 195 | 196 | Female | 35 | 120 | 79 |
| 196 | 197 | Female | 45 | 126 | 28 |
| 197 | 198 | Male | 32 | 126 | 74 |
| 198 | 199 | Male | 32 | 137 | 18 |
| 199 | 200 | Male | 30 | 137 | 83 |
200 rows × 5 columns
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()
| CustomerID | Age | Annual Income (k$) | Spending Score (1-100) | |
|---|---|---|---|---|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean | 100.500000 | 38.850000 | 60.560000 | 50.200000 |
| std | 57.879185 | 13.969007 | 26.264721 | 25.823522 |
| min | 1.000000 | 18.000000 | 15.000000 | 1.000000 |
| 25% | 50.750000 | 28.750000 | 41.500000 | 34.750000 |
| 50% | 100.500000 | 36.000000 | 61.500000 | 50.000000 |
| 75% | 150.250000 | 49.000000 | 78.000000 | 73.000000 |
| max | 200.000000 | 70.000000 | 137.000000 | 99.000000 |
# Print the column names, non-null counts, dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 200 non-null int64 1 Genre 200 non-null object 2 Age 200 non-null int64 3 Annual Income (k$) 200 non-null int64 4 Spending Score (1-100) 200 non-null int64 dtypes: int64(4), object(1) memory usage: 7.9+ KB
# Age distribution: histogram on a density scale with a KDE overlay.
# `sns.distplot` is deprecated and scheduled for removal, so the axes-level
# `histplot` is used instead.
sns.histplot(df['Age'], kde=True, stat='density')
C:\Users\Kanon\AppData\Local\Temp\ipykernel_2344\3255828239.py:1: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df['Age'])
<Axes: xlabel='Age', ylabel='Density'>
# Distribution plots for annual income and spending score, side by side.
dist_columns = ['Annual Income (k$)', 'Spending Score (1-100)']

# One panel per column, so the grid never contains an empty subplot.
fig, axes = plt.subplots(1, len(dist_columns), figsize=(15, 5))
for ax, col in zip(axes, dist_columns):
    # `histplot` replaces the deprecated `distplot`; stat='density' plus
    # kde=True keeps the histogram-with-density-curve appearance.
    sns.histplot(df[col], kde=True, stat='density', ax=ax)

plt.tight_layout()
plt.show()
C:\Users\Kanon\AppData\Local\Temp\ipykernel_2344\2465352626.py:9: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[col]) C:\Users\Kanon\AppData\Local\Temp\ipykernel_2344\2465352626.py:9: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[col])
import pandas as pd
import matplotlib.pyplot as plt
# Sample data (replace with your actual data).
data = {
    'Age': [25, 30, 35, 40],
    'Count': [20, 30, 15, 10],
}
# Keep the placeholder table under its own name so that the customer
# DataFrame `df` loaded earlier is not overwritten by sample values.
age_counts = pd.DataFrame(data)

# Extract data for the pie chart.
values = age_counts['Count']
labels = age_counts['Age']
explode = (0.1, 0, 0, 0)  # Explode the first section for emphasis
colors = ['purple', 'pink', 'green', 'blue']

fig, ax = plt.subplots(figsize=(8, 8), dpi=100)

# Create the pie chart; `ax.pie` returns the wedges, the label texts and the
# percentage texts (the last because `autopct` is given).
patches, texts, autotexts = ax.pie(
    values,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
    explode=explode,
    colors=colors,
    labeldistance=1.05,  # Adjust label distance for separation
    pctdistance=0.85,    # Adjust percentage label distance
    rotatelabels=True,   # Rotate the labels for better readability
)

# Customize text colors.
plt.setp(texts, size=12, weight='bold', color='black')
plt.setp(autotexts, size=14, weight='bold', color='white')

# Add a title.
ax.set_title("Age Distribution")

# Display the pie chart.
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Elbow method: fit KMeans for K = 1..15 and plot the inertia (SSE) per K.
d = pd.read_csv("Mall_Customers.csv")
print(d)

# Take an explicit copy of the feature columns. A plain column selection is
# a slice of `d`, and adding a column to it later triggers pandas'
# SettingWithCopyWarning.
X = d[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].copy()

k_values = range(1, 16)
sse = []
for k in k_values:
    # Fixed n_init and random_state keep the curve reproducible between runs.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)

plt.figure(figsize=(10, 5))
plt.plot(k_values, sse, marker='o', linestyle='--')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.title('Elbow Method for Optimal K')
plt.grid(True)
plt.show()
CustomerID Genre Age Annual Income (k$) Spending Score (1-100) 0 1 Male 19 15 39 1 2 Male 21 15 81 2 3 Female 20 16 6 3 4 Female 23 16 77 4 5 Female 31 17 40 .. ... ... ... ... ... 195 196 Female 35 120 79 196 197 Female 45 126 28 197 198 Male 32 126 74 198 199 Male 32 137 18 199 200 Male 30 137 83 [200 rows x 5 columns]
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. 
warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. 
You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
# Cluster the customers into 5 groups and attach the cluster id as `label`.
# The feature columns are named explicitly so that re-running this cell never
# feeds a previously added `label` column back into the model.
feature_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# n_init is set explicitly (its default is changing across scikit-learn
# versions) and random_state is fixed so the cluster ids are reproducible.
km = KMeans(n_clusters=5, n_init=10, random_state=42)
y = km.fit_predict(X[feature_cols])

# `assign` returns a new DataFrame instead of writing into what may be a
# slice of another frame, which avoids SettingWithCopyWarning.
X = X.assign(label=y)
X.head()
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( C:\Users\Kanon\AppData\Local\Temp\ipykernel_2344\482065266.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X['label'] = y
| Age | Annual Income (k$) | Spending Score (1-100) | label | |
|---|---|---|---|---|
| 0 | 19 | 15 | 39 | 4 |
| 1 | 21 | 15 | 81 | 3 |
| 2 | 20 | 16 | 6 | 4 |
| 3 | 23 | 16 | 77 | 3 |
| 4 | 31 | 17 | 40 | 4 |
import plotly.express as px
import plotly.graph_objs as go
# 3D view of the clusters. The label is converted to a string so plotly
# treats it as a category (discrete colours and a legend) rather than a
# continuous value. Marker size is no longer tied to the label: the cluster
# id is arbitrary, and id 0 would give its points a size of zero.
plot_df = X.assign(label=X['label'].astype(str))
fig = px.scatter_3d(
    plot_df,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    z="Age",
    color='label',
)
fig.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import dendrogram, linkage
# Hierarchical (Ward) clustering on the customer features only. `X` also
# carries the KMeans `label` column at this point; including it would let one
# clustering result leak into the other.
feature_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
features = X[feature_cols]

plt.figure(figsize=(12, 8))
dendo = dendrogram(linkage(features, method='ward'))
plt.title('Dendrogram', fontsize=15)
plt.show()

# Ward linkage only supports Euclidean distance, which is the default, so the
# deprecated `affinity` argument is simply dropped.
agc = AgglomerativeClustering(n_clusters=5, linkage='ward')
labels = agc.fit_predict(features)
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_agglomerative.py:983: FutureWarning: Attribute `affinity` was deprecated in version 1.2 and will be removed in 1.4. Use `metric` instead
import matplotlib.pyplot as plt
import numpy as np
# Scatter plot of the hierarchical clusters, one series per cluster.
# `labels` holds the cluster assignment of each row of `X`.
# `X` is a DataFrame, so rows are selected with a boolean mask through `.loc`
# and columns by name; NumPy-style `X[mask, 0]` indexing raises on a DataFrame.
x_col = 'Annual Income (k$)'
y_col = 'Spending Score (1-100)'

for label in np.unique(labels):
    in_cluster = labels == label
    plt.scatter(X.loc[in_cluster, x_col], X.loc[in_cluster, y_col],
                label=f'Cluster {label + 1}', s=100)

plt.title('Hierarchical Clustering')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
# Load the CSV data
data = pd.read_csv('Mall_Customers.csv')

# Build the feature matrix from the freshly loaded data rather than reusing
# `X`, which by now also contains the KMeans `label` column.
feature_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
income_idx = feature_cols.index('Annual Income (k$)')
score_idx = feature_cols.index('Spending Score (1-100)')

# Standardize the features so that `eps` means the same distance on every axis.
scaler = StandardScaler()
X = scaler.fit_transform(data[feature_cols])

# Create a DBSCAN clustering model
dbscan = DBSCAN(eps=0.3, min_samples=5)
dbscan.fit(X)

# Get the labels assigned to each data point (DBSCAN marks noise as -1).
labels = dbscan.labels_

# Create a scatter plot to visualize the clusters. The plotted columns are
# looked up by feature name so they match the axis labels below.
plt.figure(figsize=(12, 8))
plt.scatter(X[:, income_idx], X[:, score_idx], c=labels, cmap='winter')
plt.title('DBSCAN Clustering')
plt.xlabel('Annual Income(k$)')
plt.ylabel('Spending Score(1-100)')
plt.show()